***********************************************************
* Internal migration and crime in Brazil *
* Author: Eva-Maria Egger 

* Contact: egger@wider.unu.edu
***********************************************************

* This do-file combines and cleans all data files for analysis.
	
***********************************************************
** Set directories and globals

* Output-directory globals. Both are assigned an empty value here; fill in the
* folder paths before running. The tables global is used for the tabout
* output further down; the graphs global is not referenced in this do-file.
* NOTE(review): the data global used in every use/save path below is not
* defined in this file - presumably it is set by a calling do-file; confirm.
global tables ""
global graphs ""

***********************************************************
* Data and do-files used:

	*INPUTS:
		*"MR_panel_ALL_FULLt.dta"
		*"MR_panel_ALL_FULLt_Dist.dta"
		*"MR_coordinates.dta"
		*"MR_MIV_origin"
		*"MR_allcodes"
		*"CENSO 2010_ind"
		*"MR_distancematrix"
		
	*OUTPUTS: 
		*"MR_analysis.dta"
		*"Individuals-MR_IV-weighted_2010.dta" (intermediate file)
		*"Individuals_analysis.dta"
		
***********************************************************
** MR level data **
// Extract the distance-specific regressors and instruments from the distance
// panel and park them in a tempfile, so they can be merged onto the main
// micro-region panel below by micregion and year.
// The two hyphenated ranges keep every variable stored between the named
// endpoints in the file's variable order.
// Paths use forward slashes: Stata accepts them on every operating system,
// whereas backslashes only resolve on Windows.
use "$data/MR_panel_ALL_FULLt_Dist", clear
keep micregion year l_r_inD100_-l_r_inD1500_ MIVE_IND100_-MIVE_IND1500_
tempfile distance
save `distance'

* Build the micro-region (MR) analysis panel: start from the full MR panel,
* attach coordinates and the distance-specific variables, keep and label the
* analysis variables, and save the result as MR_analysis.
* Paths use forward slashes so the do-file also runs outside Windows.
use "$data/MR_panel_ALL_FULLt", clear

* Attach coordinates (using file must have one row per micregion); rows that
* exist only in the coordinates file are discarded.
merge m:1 micregion using "$data/MR_coordinates.dta", nogen keep(1 3)
drop MR pop_size

* Attach the distance-specific variables from the distance tempfile created above.
merge 1:1 micregion year using `distance', nogen keep(1 3)

* Keep only relevant variables.
* Variable names are spelled out exactly as they are labelled below
* (homic2000_, MIVE_rob_) so the block does not depend on variable
* abbreviation being switched on.
keep micregion year homicrate_ r_inD_ l_homicrate_ l_r_inD* MIVE_IND* zMIVE l_population_ population_ DYear uf_year l_gdp_ l_averagemonthlywage l_numemp Mavgwage unemployed_ formal_ lhwage_ ymaleUE_  numemp homic2000_  MRIVE_INDall_ MIVE_rob_ uf MIVE_DIST_ id 

* Labels
lab var micregion "Microregiao (MR) identifier"
lab var year "Year"
lab var homicrate_ "Homicides per 100,000 inhabitants"
lab var r_inD_ "In-migrants per 100,000 inhabitants"
lab var unemployed_ "Unemployment rate"
lab var lhwage_ "Average log(hourly wage)"
lab var ymaleUE_ "Share of young, male and unemployed"
lab var formal_ "Share of formally employed workers"
lab var Mavgwage "Average monthly manufacturing wage"
lab var uf "State identifier"
lab var population_ "Microregiao population"
lab var homic2000_ "Homicide rate in 2000"
lab var DYear "Analysis years"
lab var MIVE_rob_ "IV excluding origins with largest Rothemberg weights"
lab var MIVE_INDT_ "IV, weighted using all years"
lab var MIVE_INDt_ "IV, weighted using 2005-2010"
lab var MIVE_INDall_ "IV"
lab var MIVE_DIST_ "IV, weighted using distance" 
lab var MRIVE_INDall_ "IV, regional employment growth"
lab var numemp "Number of formally employed workers"
lab var l_homicrate_ "Log(Homicide rate)"
lab var l_r_inD_ "Log(In-migration rate)"
lab var l_population_ "Log(Population)"
lab var l_averagemonthlywage "Log(Average monthly wage)"
lab var l_numemp "Log(Number of formally employed workers)"
lab var l_gdp_ "Log(local GDP)"
lab var uf_year "State-year fixed effect"
lab var zMIVE "Standardized values of local labor demand shock"
lab var id "Microregiao identifier for coordinates"
* One label pair per distance cut-off, 100 km to 1500 km in 100 km steps.
forv x=100(100)1500{
	lab var l_r_inD`x'_ "Log(in-migration rate, distance cut-off `x' kilometers)"
	lab var MIVE_IND`x'_ "IV, distance cut-off `x' kilometers"
}

* Put the key analysis variables first, sort the panel and shrink storage types.
order micregion year l_homicrate_ l_r_inD_ MIVE_INDall_ zMIVE l_population_ population_ uf_year 
sort micregion year 
compress

save "$data/MR_analysis", replace 

********************************************************************************
** Individual level data **
* Get individual migrants matched with local labor demand shocks at origin

* Build a list with one row per micro-region code, renamed to orig_mr, and
* store it in a tempfile for the merge on origin micro-region further down.
* The list contains only the key variable: keep + duplicates drop replaces
* the earlier collapse-then-drop construction and yields the same rows.
* Path uses a forward slash so the do-file also runs outside Windows.
use "$data/MR_allcodes", clear
rename micregion orig_mr
keep orig_mr
duplicates drop
sort orig_mr
tempfile orig
save `orig'

* Load the 2010 Census individual file, flag migrants and repeat migrants,
* and tabulate characteristics of repeat versus single-move migrants.
* NOTE(review): unlike the other inputs this path has no data-directory
* prefix, so the file is read from the current working directory - confirm.
use "CENSO 2010_ind", clear

* Migrant flag: origin micro-region differs from the current one, subject to
* the born_MC and time_MC conditions.
* NOTE(review): the coding of born_MC and time_MC is not visible here.
g migrant0510 = (born_MC>=2 & time_MC<=4 & orig_mr!=micregion)
* Repeat migrant: origin, reside_MR and current micro-region all differ.
g twice_mig = (born_MC>=2 & time_MC<=4 & orig_mr!=reside_MR & orig_mr!=micregion & reside_MR!=micregion)
tab  twice_mig migrant0510, col //to check how many migrants moved also before latest move
* comp_mig: 1 = repeat migrant, 0 = other migrant, missing = non-migrant.
g comp_mig = 1 if twice_mig==1
replace comp_mig=0 if migrant0510==1 & comp_mig!=1

	*Describe multiple migrants:
	g high=(educ_level>=3) //high school
	g unemployed=(work_emp_main==0)
	* dummy for being young (16-25), male and low-educated
	g ymaleLE= (age>=16 & age<=25 & educ_level<3 & sex==0)
	* dummy for being young, male, low-educated and unemployed
	* (the low-education condition is already part of ymaleLE)
	g ymaleleUE= (ymaleLE==1 & unemployed==1)
	* dummy for being male, low-educated and unemployed; the unemployment
	* condition was previously missing, so the variable did not match its name
	g maleleUE= (sex==0 & educ_level<3 & unemployed==1)
	g informality=(informal==2)
	
* Means of the listed characteristics by comp_mig, written to the tables folder.
tabout comp_mig using "$tables/multiple_migrants.txt", c(mean sex mean age mean white mean informality mean ymaleLE mean ymaleleUE mean maleleUE) sum format(2) replace

* Reduce the census file to the variables needed below; this also discards
* the descriptive helper variables created for the table above.
keep uf_ori uf migrant educ_high work_emp_main informal activ_group lwage_ educ_level comp_mig ///
	youth nonwhite dropouts adequate sex age area orig_mr time_MC micregion

* Attach micro-region codes, the origin list and the origin-destination
* distance. Unmatched census rows are kept; using-only rows are discarded.
* Paths use forward slashes so the do-file also runs outside Windows.
merge m:1 micregion using "$data/MR_allcodes", nogen keep(1 3)
merge m:1 orig_mr using `orig', nogen keep(1 3)
merge m:1 micregion orig_mr using "$data/MR_distancematrix", nogen keep(1 3)

* A person whose origin equals the current micro-region is not a migrant.
recode migrant 1=0 if micregion==orig_mr

* Distance-specific migrant dummies.
* A relational expression in Stata is never missing, so the former fourth
* argument of cond() could not trigger; the plain logical expression gives
* exactly the same 0/1 values. A missing distance therefore yields 0 in every
* dummy, because missing compares as larger than any number.
* migD200 ... migD2000: migrant with 100 <= distance < cut-off.
forv d=200(100)2000{
	g migD`d' = (migrant==1 & distance>=100 & distance<`d')
}
* migD100: migrant with 0 <= distance <= 100.
g migD100 = (migrant==1 & distance>=0 & distance<=100)
* migD2000p: migrant with distance of 2000 or more.
g migD2000p = (migrant==1 & distance>=2000 & distance!=.)

* Individual-level indicator variables, then trim the file and save it as
* tempfile t1 for the merge with the micro-region panel.
*
* Missing-value handling: the former cond(expr, 1, 0, .) pattern never
* returned missing, because a relational expression is never missing in Stata
* and missing compares as larger than any number. A missing educ_level was
* therefore coded high = 1, and a missing adequate was coded housing = 1.
* Each dummy is now set to missing when its input is missing.

* high- and low-skilled workers (educ_level of 3 or above = high)
g high = (educ_level>=3) if !missing(educ_level)
* dummies equal 1 for the category of interest
g unemployed = (work_emp_main==0) if !missing(work_emp_main)
g public=cond(informal==1, 1, 0) //formal public job
replace informal=0 if informal==1
* informality (including no card, self-employed, small business <5);
* missing values of informal are left missing rather than recoded to 1
replace informal=1 if informal>=2 & !missing(informal)
g agriculture = (activ_group==1) if !missing(activ_group) //(agriculture)
g lwageH=lwage_ if high==1 //wage for high skilled
g lwageL=lwage_ if high==0 //wage for low skilled
g loweduc = (educ_level<=2) if !missing(educ_level) //low educated (fundamental or middle incomplete)
g housing = (adequate>=2) if !missing(adequate) //housing is semi- or inadequate (official definition of Census 2010)
g agrilwage=lwage_ if activ_group==1
* young (16-25), male and not high-skilled; missing if any input is missing
g ymale = (sex==0 & inrange(age, 16, 25) & high==0) if !missing(sex, age, high)
g young = inrange(age, 16, 25) if !missing(age)

keep micregion migrant migD* area sex age ymale high housing loweduc lwage* agriculture informal youth nonwhite dropouts unemployed public orig_mr time_MC uf_ori comp_mig
sort micregion
g year = 2010
tempfile t1
save `t1'

* Reshape the micro-region panel to one row per micregion (one column per
* variable and year) and attach it to the individual-level file.
* The stub list is held in one local so that keep and reshape cannot drift
* apart; the order is the one previously passed to reshape.
* Paths use forward slashes so the do-file also runs outside Windows.
local stubs l_population_ region uf unemployed_ informal_ public_ agriculture_ lwage_ loweduc_ high_ ymale_ young_ homicrate_ r_inD_ MIVE_INDall_ youth_ nonwhite_ dropouts_

use "$data/MR_panel_ALL_FULLt.dta", clear
keep micregion year `stubs'
reshape wide `stubs', i(micregion) j(year)

tempfile t2
save `t2'

* Back to the individual-level tempfile; keep only individuals whose
* micro-region is present in the reshaped panel.
use `t1', clear
merge m:1 micregion using `t2', nogen keep(3)

g agesq = (age)^2 

save "$data/Individuals-MR_IV-weighted_2010", replace
* Prepare the origin-level instrument file: attach the micro-region codes to
* the origin IV data and store the result in a tempfile keyed on orig_mr.
* The data were previously saved to, and merged from, a literal file named
* "miv" in the working directory although a tempfile had been declared; the
* tempfile is now used, so no stray miv.dta is left behind.
* Paths use forward slashes so the do-file also runs outside Windows.
tempfile miv
use "$data/MR_MIV_origin", clear
rename orig_mr micregion
merge m:1 micregion using "$data/MR_allcodes", nogen keep(1 3)
rename micregion orig_mr
save `miv'

* Migrants only, matched to the instrument of their origin micro-region;
* migrants without a match in the origin file are dropped.
use "$data/Individuals-MR_IV-weighted_2010", clear
keep if migrant==1 
merge m:1 orig_mr using `miv', nogen keep(3)
* Young = aged 16-25; set to missing when age is missing, because the fourth
* argument of the former cond() call could never trigger on a relational
* expression.
g young = inrange(age, 16, 25) if !missing(age)
keep o_MIVE_* loweduc sex young ymale comp_mig uf_ori time_MC migD100 migD500 migD1000 migD1500

lab var loweduc "Low education"
lab var sex "Female"
lab var young "Young (16-25 years)"
lab var ymale "Young male (16-25 years)"
lab var comp_mig "Migrants who moved at least one other time within past 5 years"
lab var uf_ori "State of origin"
lab var time_MC "Years since migration"
foreach x in 100 500 1000 1500{
	lab var migD`x' "Migrant, distance cut-off `x' kilometers"
}
forv x=2003/2009{
	lab var o_MIVE_`x' "Origin labor demand shock in year `x'"
}

compress 

save "$data/Individuals_analysis", replace

* End of do-file.
exit

